---
title: "2025 Regressions"
output: html_document
date: "2025-05-01"
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```


```{r}
rm(list = ls())
cat("\014")
```

```{r}
library(vroom)
library(tidyverse)
library(lmtest)
library(sandwich)
library(plm)
library(ggthemes)
library(broom)
library(glmmTMB)
library(fixest)


options(scipen = 999)
```

*Load Data*

*All public and private Y02 patents granted to US inventors at USPTO*

```{r}
data <- vroom("data/Y02_data_2025.csv")

```

*How many patents?*
```{r}
data <- data %>%
  #filter(is.na(fedagency_name)) %>%
  filter(patent_year >= 2000 & patent_year <= 2024)


n_distinct(data$patent_id)
```

```{r}
data <- data %>%
  filter(cpc_subclass == "Y02E")
```




```{r}
gov <- data %>%
  filter(!is.na(level_one)) 

n_distinct(gov$patent_id)
```

```{r}
16014 / 176531
```

```{r}
#Distinct combinations of patent id and cpc_group to avoid duplicates 
data <- data %>%
    distinct(patent_id, cpc_group, .keep_all = T)
```

*I aggregate patents at the lowest technology category level -- cpc-group*

```{r}
gov <- data %>%
  filter(!is.na(fedagency_name)) %>%
  group_by(patent_year, cpc_group) %>%
  summarise(gov_count = n_distinct(patent_id)) %>%
  mutate(patent_year = as.integer(patent_year)) 

us <- data %>%
  filter(is.na(fedagency_name)) %>%
  group_by(patent_year, cpc_group) %>%
  summarise(us_count = n_distinct(patent_id)) %>%
  mutate(patent_year = as.integer(patent_year))
```


```{r}
dt <- left_join(us, gov, by = c("patent_year", "cpc_group"))

dt[is.na(dt)] <- 0 

frame <- expand.grid(
  patent_year = 2000:2024,
  cpc_group = unique(dt$cpc_group)
)

frame <- left_join(frame, dt, by = c("patent_year", "cpc_group"))

frame[is.na(frame)] <- 0 
```

```{r}
patent_data <- frame

patent_data$year <- as.factor(patent_data$patent_year)
patent_data$technology_class <- as.factor(patent_data$cpc_group)

model.data <- patent_data
```



*Base Model*

```{r}

base <- lm(log(us_count + 1) ~ log(gov_count + 1) + technology_class + year, data = model.data)

summary(base)
nobs(base)
```

*Clustered SEs*

```{r}

coeftest(base, vcov = vcovCL(base, cluster = ~technology_class + year))

```

0.5228578  0.1244723



*Switching the DV*

```{r}
model1 <- lm(log(gov_count + 1) ~ log(us_count + 1) + technology_class + factor(year), data = model.data)

# Get robust standard errors
coeftest(model1, vcov = vcovHC(model1, type = "HC1"))
```

0.28004141  0.01057410



*Does this effect persist?*

*Distributed Lag Model*

```{r}

model.data <- model.data %>%
  dplyr::arrange(cpc_group, patent_year) %>%
  dplyr::group_by(cpc_group) %>%
  dplyr::mutate(
    # Government patent count lags
    gov_count_lag0 = gov_count,
    gov_count_lag1 = dplyr::lag(gov_count, 1),
    gov_count_lag2 = dplyr::lag(gov_count, 2),
    gov_count_lag3 = dplyr::lag(gov_count, 3),
    gov_count_lag4 = dplyr::lag(gov_count, 4), 
    gov_count_lag5 = dplyr::lag(gov_count, 5), 
    gov_count_lag6 = dplyr::lag(gov_count, 6), 
    gov_count_lag7 = dplyr::lag(gov_count, 7), 
    gov_count_lag8 = dplyr::lag(gov_count, 8), 
    gov_count_lag9 = dplyr::lag(gov_count, 9), 
    gov_count_lag10 = dplyr::lag(gov_count, 10),
  ) %>%
  dplyr::ungroup()


```

*10 - Lag Model* 

```{r}
dlm_model <- lm(
  log(us_count + 1) ~ log(gov_count_lag0 + 1) +
                      log(gov_count_lag1 + 1) +
                      log(gov_count_lag2 + 1) +
                      log(gov_count_lag3 + 1) +
                      log(gov_count_lag4 + 1) +
                      log(gov_count_lag5 + 1) +
                      log(gov_count_lag6 + 1) +
                      log(gov_count_lag7 + 1) +
                      log(gov_count_lag8 + 1) +
                      log(gov_count_lag9 + 1) +
                      log(gov_count_lag10 + 1) +
                      technology_class + year,
  data = model.data
)

summary(dlm_model)
nobs(dlm_model)
```


#cumulative effect 

```{r}

coefs <- coef(dlm_model)
vcov_matrix <- vcovHC(dlm_model, type = "HC1")

lags_0_5 <- paste0("log(gov_count_lag", 0:5, " + 1)")

cum_effect <- function(lags, coefs, vcov_matrix) {
  beta <- sum(coefs[lags])
  se <- sqrt(sum(vcov_matrix[lags, lags]))
  ci_lower <- beta - 1.96 * se
  ci_upper <- beta + 1.96 * se
  return(data.frame(
    cumulative_effect = beta,
    std_error = se,
    ci_lower = ci_lower,
    ci_upper = ci_upper
  ))
}

cum_5 <- cum_effect(lags_0_5, coefs, vcov_matrix)

cum_5

```



*AIC / BIC of different lag structures* 


```{r}
# Create a data frame to store AIC and BIC results
aic_bic_results <- data.frame(
  lags = integer(),
  AIC = numeric(),
  BIC = numeric()
)

# Loop over 0 to 10 lags
for (k in 0:10) {
  # Construct the formula dynamically
  lag_terms <- paste0("log(gov_count_lag", 0:k, " + 1)", collapse = " + ")
  full_formula <- as.formula(paste0(
    "log(us_count + 1) ~ ", lag_terms, " + technology_class + year"
  ))
  
  # Fit the model
  model_k <- lm(full_formula, data = model.data)
  
  # Store AIC and BIC
  aic_bic_results <- rbind(aic_bic_results, data.frame(
    lags = k,
    AIC = AIC(model_k),
    BIC = BIC(model_k)
  ))
}

# View the result
print(aic_bic_results)

```

```{r}
library(knitr)
kable(aic_bic_results, caption = "Model Fit by Lag Structure (OLS)", digits = 2)

```



*Clustered SEs*
```{r}

coeftest(dlm_model, vcov = vcovCL(dlm_model, cluster = ~technology_class))

```


*Cumulative Effect*
```{r}

# Get robust (e.g., HC1) variance-covariance matrix
robust_vcov <- vcovHC(dlm_model, type = "HC1")

# Extract coefficients and variance-covariance matrix for lag terms
lag_vars <- c("log(gov_count_lag0 + 1)", "log(gov_count_lag1 + 1)",
              "log(gov_count_lag2 + 1)", "log(gov_count_lag3 + 1)",
              "log(gov_count_lag4 + 1)", "log(gov_count_lag5 + 1)")

# Get the coefficient estimates for lag terms
lag_coefs <- coef(dlm_model)[lag_vars]

# Sum of coefficients: cumulative effect
cumulative_effect <- sum(lag_coefs)

# Calculate the standard error of the sum using delta method:
# Var(sum) = 1' * V * 1, where V is the submatrix of the vcov for lag terms
lag_vcov <- robust_vcov[lag_vars, lag_vars]
cumulative_se <- sqrt(sum(lag_vcov))  # since all off-diagonal elements are included

# 95% confidence interval
lower_bound <- cumulative_effect - 1.96 * cumulative_se
upper_bound <- cumulative_effect + 1.96 * cumulative_se

# Print result
cat("Cumulative Effect:", round(cumulative_effect, 4), "\n")
cat("Standard Error:", round(cumulative_se, 4), "\n")
cat("95% CI: [", round(lower_bound, 4), ",", round(upper_bound, 4), "]\n")

```



```{r}

# Extract all coefficient names
coef_names <- names(coef(dlm_model))

# Match all log-transformed lag terms: "log(gov_count_lag0 + 1)", ..., "log(gov_count_lag10 + 1)"
lag_indices <- grep("log\\(gov_count_lag[0-9]+ \\+ 1\\)", coef_names)
lag_names <- coef_names[lag_indices]

# Extract the lag numbers for labeling
lags <- as.integer(gsub(".*lag([0-9]+).*", "\\1", lag_names))

# Extract coefficients and robust SEs
vcov_hc <- vcovHC(dlm_model, type = "HC1")
coefs <- coef(dlm_model)[lag_names]
ses <- sqrt(diag(vcov_hc)[lag_names])

# Create data frame for plotting
df_plot <- data.frame(
  lag = lags,
  coef = coefs,
  se = ses
) %>%
  arrange(lag) %>%
  mutate(
    lower = coef - 1.96 * se,
    upper = coef + 1.96 * se
  )

# Plot
p <- ggplot(df_plot, aes(x = lag, y = coef)) +
  geom_point(size = 2) +
  geom_line() +
  geom_errorbar(aes(ymin = lower, ymax = upper), width = 0.2, color = "gray40") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "gray") +
  scale_x_continuous(breaks = 0:10) +  # <-- Force integer ticks from 0 to 10
  labs(
    title = "Marginal Contribution of Each Log Lag Term",
    x = "Lag (years)",
    y = "Coefficient (log-log scale)"
  ) +
  theme_clean(base_size = 14) +
  expand_limits(y = c(min(df_plot$lower) - 0.05, max(df_plot$upper) + 0.05))


# Print
print(p)

# Save the plot
ggsave("log_+_1.png", plot = p, width = 8, height = 8/1.618, units = "in")

```



*Poisson Model*

```{r}
ppml_model <- fepois(
  us_count ~ gov_count_lag0 +
             gov_count_lag1 +
             gov_count_lag2 +
             gov_count_lag3 +
             gov_count_lag4 +
             gov_count_lag5 +
             gov_count_lag6 +
             gov_count_lag7 +
             gov_count_lag8 +
             gov_count_lag9 +
             gov_count_lag10 +
             technology_class + year,
  data = model.data,
  cluster = c("technology_class")
)

summary(ppml_model)

nobs(ppml_model)
```

```{r}
ppml_model$pseudo_r2

```


```{r}
library(fixest)

# Define lag variable names
lags_0_5 <- paste0("gov_count_lag", 0:5)

# Extract coefficients and variance-covariance matrix
coefs <- coef(ppml_model)
vcov_mat <- vcov(ppml_model)

# Compute cumulative effect and standard error
beta <- sum(coefs[lags_0_5])
se <- sqrt(sum(vcov_mat[lags_0_5, lags_0_5]))

# Compute confidence interval
ci_lower <- beta - 1.96 * se
ci_upper <- beta + 1.96 * se

# Output
cumulative_5yr <- data.frame(
  cumulative_effect = beta,
  std_error = se,
  ci_lower = ci_lower,
  ci_upper = ci_upper
)

cumulative_5yr

```



```{r}


# Storage for AIC and BIC values
aic_bic_results <- data.frame(
  lags = integer(),
  AIC = numeric(),
  BIC = numeric()
)

# Loop over lag structures
for (k in 0:10) {
  # Construct lag terms
  lag_terms <- paste0("gov_count_lag", 0:k, collapse = " + ")
  
  # Construct the full formula
  full_formula <- as.formula(paste0(
    "us_count ~ ", lag_terms, " | technology_class + year"
  ))
  
  # Estimate model using fepois
  model_k <- fepois(
    fml = full_formula,
    data = model.data,
    cluster = "technology_class"
  )
  
  # Collect AIC and BIC
  aic_bic_results <- rbind(aic_bic_results, data.frame(
    lags = k,
    AIC = AIC(model_k),
    BIC = BIC(model_k)
  ))
}

# Display results
print(aic_bic_results)


```

```{r}
library(xtable)
xtable(aic_bic_results, caption = "Model Fit by Lag Structure (PPML)", digits = 2)

```



```{r}

lag_coefs <- tidy(ppml_model) %>%
  filter(term %in% paste0("gov_count_lag", 0:10)) %>%
  mutate(lag = as.integer(gsub("gov_count_lag", "", term)))

ggplot(lag_coefs, aes(x = lag, y = estimate)) +
  geom_line() +
  geom_point() +
  geom_errorbar(aes(ymin = estimate - 1.96 * std.error,
                    ymax = estimate + 1.96 * std.error,
                    width = 0.2)) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  scale_x_continuous(breaks = 0:10) +
  theme_clean() +
  labs(title = "",
       x = "Lag (Years)", y = "Coefficient Estimate")


width <- 8 

ggsave("ppml.png", width = width, height = width/1.618, units = "in")



```


